In [1]:
from IPython.display import display, HTML
# Set the notebook width to 80%
display(HTML("<style>.container { width: 80% !important; }</style>"))
Import dep¶
In [2]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [3]:
import plotly
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.io as pio
pio.renderers.default = "notebook"
In [4]:
COLOR_LIST = plotly.colors.DEFAULT_PLOTLY_COLORS
len(COLOR_LIST)
Out[4]:
In [5]:
print(plotly.__version__, plotly.__path__)
In [6]:
import notebook
import ipywidgets
print(notebook.__version__, notebook.__path__)
print(ipywidgets.__version__, ipywidgets.__path__)
In [7]:
from joblib import Parallel, delayed, parallel_backend
from itertools import product
In [8]:
def plot_perf_metrics(ls_rewards, ls_opt_act_flgs, ls_param, param_prefix='epsilon', plt_lib='mlp', fig=None, axes=None):
if isinstance(param_prefix, str):
param_prefix = [param_prefix for _ in range(len(ls_param))]
if plt_lib == 'mlp':
if fig is None or axes is None:
fig, axes = plt.subplots(2, 1, figsize=(10, 10))
for i, eps in enumerate(ls_param):
axes[0].plot(ls_rewards[i], label='{} = {}'.format(param_prefix[i], eps))
axes[1].plot(ls_opt_act_flgs[i], label='{} = {}'.format(param_prefix[i], eps))
if fig is None:
axes[0].set_title('Average Reward')
axes[0].set_xlabel('Steps')
axes[0].set_ylabel('Average Reward')
axes[0].legend()
axes[1].set_title('% Optimal Action')
axes[1].set_xlabel('Steps')
axes[1].set_ylabel('% Optimal Action')
axes[1].legend()
plt.show()
else:
# Create subplots with 2 rows and 1 column
if fig is None:
fig = make_subplots(rows=2, cols=1)
for i, eps in enumerate(ls_param):
# Add traces to the first subplot for rewards
fig.add_trace(go.Scatter(x=np.arange(ls_rewards[i].shape[0]),
y=ls_rewards[i],
mode='lines',
name='{} = {}'.format(param_prefix[i], eps),
line=dict(color=COLOR_LIST[i])), row=1, col=1)
# Add traces to the second subplot for optimal action rate
fig.add_trace(go.Scatter(x=np.arange(ls_opt_act_flgs[i].shape[0]),
y=ls_opt_act_flgs[i],
mode='lines',
# name='param = {}'.format(eps),
line=dict(color=COLOR_LIST[i]),
showlegend=False),
row=2, col=1)
if fig is None:
# Add axis titles
fig.update_xaxes(title_text='Steps', row=1, col=1)
fig.update_yaxes(title_text='Average Reward', row=1, col=1)
fig.update_xaxes(title_text='Steps', row=2, col=1)
fig.update_yaxes(title_text='% Optimal Action', row=2, col=1)
# Update layout
fig.update_layout(height=600, width=800, title_text='Subplots', hovermode='x')
# Show the figure
fig.show()
10 arms bandit¶
In [9]:
def k_bandit_sim_eps(k, n_steps, n_exps, epsilon, new_bandit=True, init_q_star=0, n_jobs=10):
q_star, opt_a = None, None
if not new_bandit:
# Initialize the q_star values
q_star = np.random.normal(0, 1, k)
opt_a = np.argmax(q_star)
def _one_sim(i, q_star, opt_a):
if new_bandit:
# Initialize the q_star values
q_star = np.random.normal(0, 1, k)
opt_a = np.argmax(q_star)
# Initialize the q values
q = np.ones(k)*init_q_star
# Initialize the number of times each action was taken
n = np.zeros(k)
rewards = np.zeros(n_steps)
opt_act_flgs = np.zeros(n_steps)
for j in range(n_steps):
# Choose an action
if np.random.rand() < epsilon:
a = np.random.randint(k)
else:
a = np.argmax(q)
# Get the reward
reward = np.random.normal(q_star[a], 1)
# Update the q values
n[a] += 1
q[a] += (reward - q[a]) / n[a]
# Store the reward
rewards[j] = reward
opt_act_flgs[j] = int(a == opt_a)
return rewards, opt_act_flgs
with parallel_backend('loky', n_jobs=n_jobs):
results = Parallel()(delayed(_one_sim)(i, q_star, opt_a) for i in range(n_exps))
rewards, opt_act_flgs = zip(*results)
rewards = np.mean(np.array(rewards), axis=0)
opt_act_flgs = np.mean(np.array(opt_act_flgs), axis=0)
return rewards, opt_act_flgs
New bandit¶
- The plot doesn't depend on the randomization of initial q_star too much.
In [10]:
k_arms, n_steps, n_exps = 10, 1000, 2000
ls_epsilon = [0, 0.01, 0.1, 0.3]
ls_rewards = []
ls_opt_act_flgs = []
for eps in ls_epsilon:
rewards, opt_act_flgs = k_bandit_sim_eps(k_arms, n_steps, n_exps, eps)
ls_rewards.append(rewards)
ls_opt_act_flgs.append(opt_act_flgs)
In [11]:
plot_perf_metrics(ls_rewards, ls_opt_act_flgs, ls_epsilon, plt_lib='plotly')
Same bandit¶
- The plot results will heavily depend on the inital randomization of the q_star.
In [12]:
k_arms, n_steps, n_exps = 10, 1000, 2000
ls_epsilon = [0, 0.01, 0.1, 0.3]
ls_rewards = []
ls_opt_act_flgs = []
for eps in ls_epsilon:
rewards, opt_act_flgs = k_bandit_sim_eps(k_arms, n_steps, n_exps, eps, new_bandit=False)
ls_rewards.append(rewards)
ls_opt_act_flgs.append(opt_act_flgs)
In [13]:
plot_perf_metrics(ls_rewards, ls_opt_act_flgs, ls_epsilon, plt_lib='plotly')
The mean value of the maximum of k normal RV with mean 0 and std 1¶
- This is the best possible value of long-run k-arms bandit problem.
- In the bandit, when the maximum arm samples a maximum value at the first time, it won't guarantee that it will eventually choose the maximum arm, b/c later sampling can result it in a lower value than others.
In [14]:
max_k_rvs = np.zeros(n_exps)
max_q_star_first_realization_max_flgs = np.zeros(n_exps)
for i in range(n_exps):
q_stars = np.random.normal(0, 1, k_arms)
max_k_rvs[i] = max(q_stars)
q_vals = np.array([np.random.normal(q_stars[j], 1) for j in range(k_arms)])
max_q_star_first_realization_max_flgs[i] = int(np.argmax(q_vals) == np.argmax(q_stars))
print(np.mean(max_k_rvs), np.mean(max_q_star_first_realization_max_flgs))
Ex 2.3¶
- For $\epsilon=0.1$, the reward should be
In [15]:
np.round(np.mean(max_k_rvs)*0.9, 2), np.round(np.mean(max_k_rvs)*0.99, 2)
Out[15]:
Initial value effect¶
- Large(optimistic) initial value encourage exploration.
In [16]:
k_arms, n_steps, n_exps = 10, 1000, 2000
ls_epsilon = [0, 0.01, 0.1, 0.3]
ls_rewards = []
ls_opt_act_flgs = []
for eps in ls_epsilon:
rewards, opt_act_flgs = k_bandit_sim_eps(k_arms, n_steps, n_exps, eps, new_bandit=True, init_q_star=5)
ls_rewards.append(rewards)
ls_opt_act_flgs.append(opt_act_flgs)
In [17]:
plot_perf_metrics(ls_rewards, ls_opt_act_flgs, ls_epsilon, plt_lib='plotly')
Ex 2.5¶
- The constant-step-size scheme gives a better % in getting the optimal action.
In [18]:
def k_bandit_sim_eps_nonstationary(k, n_steps, n_exps, epsilon, init_q_star=0, rw_sig=0.1, val_scheme='sample_avg', alpha=0.1):
def _one_sim(i):
# Initialize the q_star values
q_star = np.random.normal(0, 1, k)
opt_a = np.argmax(q_star)
# Initialize the q values
q = np.ones(k)*init_q_star
# Initialize the number of times each action was taken
n = np.zeros(k)
rewards = np.zeros(n_steps)
opt_act_flgs = np.zeros(n_steps)
for j in range(n_steps):
if rw_sig > 0:
q_star += np.random.normal(0, rw_sig, k)
opt_a = np.argmax(q_star)
# Choose an action
if np.random.rand() < epsilon:
a = np.random.randint(k)
else:
a = np.argmax(q)
# Get the reward
reward = np.random.normal(q_star[a], 1)
# Update the q values
if val_scheme == 'sample_avg':
n[a] += 1
q[a] += (reward - q[a]) / n[a]
else: # constant step-size
q[a] += alpha*(reward - q[a])
# Store the reward
rewards[j] = reward
opt_act_flgs[j] = int(a == opt_a)
return rewards, opt_act_flgs
with parallel_backend('loky', n_jobs=10):
results = Parallel()(delayed(_one_sim)(i) for i in range(n_exps))
rewards, opt_act_flgs = zip(*results)
rewards = np.mean(np.array(rewards), axis=0)
opt_act_flgs = np.mean(np.array(opt_act_flgs), axis=0)
return rewards, opt_act_flgs
Use sample avg as the valuation scheme¶
In [19]:
k_arms, n_steps, n_exps = 10, 10000, 2000
ls_epsilon = [0, 0.01, 0.1, 0.3]
ls_rewards = []
ls_opt_act_flgs = []
for eps in ls_epsilon:
rewards, opt_act_flgs = k_bandit_sim_eps_nonstationary(k_arms, n_steps, n_exps, eps)
ls_rewards.append(rewards)
ls_opt_act_flgs.append(opt_act_flgs)
In [20]:
plot_perf_metrics(ls_rewards, ls_opt_act_flgs, ls_epsilon, plt_lib='plotly')
Use the constant step size as the valuation scheme¶
In [21]:
k_arms, n_steps, n_exps = 10, 10000, 2000
ls_epsilon = [0, 0.01, 0.1, 0.3]
ls_rewards = []
ls_opt_act_flgs = []
for eps in ls_epsilon:
rewards, opt_act_flgs = k_bandit_sim_eps_nonstationary(k_arms, n_steps, n_exps, eps, val_scheme='constant_step_size', alpha=0.1)
ls_rewards.append(rewards)
ls_opt_act_flgs.append(opt_act_flgs)
In [22]:
plot_perf_metrics(ls_rewards, ls_opt_act_flgs, ls_epsilon, plt_lib='plotly')
Ex 2.7¶
\begin{align*} Q_{n+1} &= Q_n + \frac{\alpha }{\bar{o}_n} (R_n - Q_n) \\ &= \frac{\alpha}{\bar{o}_n} R_n + \left(1 - \frac{\alpha}{\bar{o}_n}\right) Q_n \\ &= \frac{\alpha}{\bar{o}_n} R_n + \left(1 - \frac{\alpha}{\bar{o}_n}\right) \left(\frac{\alpha}{\bar{o}_{n-1}} R_{n-1} + \left(1 - \frac{\alpha}{\bar{o}_{n-1}}\right) Q_{n-1}\right) \\ &= \sum_{i=1}^{n} \frac{\alpha R_i}{\bar{o}_i} \prod_{j=i+1}^{n} \left(1 - \frac{\alpha}{\bar{o}_j}\right) + Q_1 \prod_{i=1}^{n} \left(1 - \frac{\alpha}{\bar{o}_i}\right) \\ \bar{o}_n &= 1-(1-\alpha)^n \end{align*}
Ex 2.8¶
In [23]:
SAMPLE_AVG = 'sample_avg'
CONSTANT_STEP_SIZE = 'constant_step_size'
def k_bandit_sim_ucb(k, n_steps, n_exps, init_q_star=0, val_scheme='sample_avg', alpha=0.1, ucb_c=1):
def _one_sim(i):
# Initialize the q_star values
q_star = np.random.normal(0, 1, k)
opt_a = np.argmax(q_star)
# Initialize the q values
q = np.ones(k)*init_q_star
# Initialize the number of times each action was taken
n = np.zeros(k)
rewards = np.zeros(n_steps)
opt_act_flgs = np.zeros(n_steps)
for j in range(n_steps):
# Choose an action
if ucb_c>=0:
ucb = q + ucb_c*np.sqrt(np.log(j+1)/(n+1e-5))
a = np.argmax(ucb)
else:
a = np.argmax(q)
# Get the reward
reward = np.random.normal(q_star[a], 1)
# Update the q values
if val_scheme == 'sample_avg':
n[a] += 1
q[a] += (reward - q[a]) / n[a]
else: # constant step-size
q[a] += alpha*(reward - q[a])
# Store the reward
rewards[j] = reward
opt_act_flgs[j] = int(a == opt_a)
return rewards, opt_act_flgs
with parallel_backend('loky', n_jobs=10):
results = Parallel()(delayed(_one_sim)(i) for i in range(n_exps))
rewards, opt_act_flgs = zip(*results)
rewards = np.mean(np.array(rewards), axis=0)
opt_act_flgs = np.mean(np.array(opt_act_flgs), axis=0)
return rewards, opt_act_flgs
In [24]:
# before parallelization, it took 7 minutes
ls_ucb_c = [0, 1, 2, 5, 10]
epsilon = 0.1
k_arms, n_steps, n_exps = 10, 5000, 2000
ls_rewards = []
ls_opt_act_flgs = []
rewards, opt_act_flgs = k_bandit_sim_eps(k_arms, n_steps, n_exps, epsilon)
ls_rewards.append(rewards)
ls_opt_act_flgs.append(opt_act_flgs)
for ucb_c in ls_ucb_c:
rewards, opt_act_flgs = k_bandit_sim_ucb(k_arms, n_steps, n_exps, ucb_c=ucb_c)
ls_rewards.append(rewards)
ls_opt_act_flgs.append(opt_act_flgs)
In [25]:
param_prefix = ['epsilon'] + ['ucb_c']*len(ls_ucb_c)
ls_param = [epsilon] + ls_ucb_c
plot_perf_metrics(ls_rewards, ls_opt_act_flgs, ls_param, param_prefix=param_prefix, plt_lib='plotly')
Gradient Bandit¶
In [26]:
def k_bandit_sim_gradient(k, n_steps, n_exps, alpha=0.1, baseline=True, init_h=0):
def _one_sim(i):
# Initialize the q_star values
q_star = np.random.normal(0, 1, k)
opt_a = np.argmax(q_star)
# Initialize the avg reward
avg_reward = 0
# Initialize the preferences
h = np.ones(k) * init_h
sum_h = np.sum(h)
rewards = np.zeros(n_steps)
opt_act_flgs = np.zeros(n_steps)
for j in range(n_steps):
# Choose an action
pi = np.exp(h) / np.sum(np.exp(h))
a = np.random.choice(k, p=pi)
# Get the reward
reward = np.random.normal(q_star[a], 1)
# Update the avg reward
avg_reward += (reward - avg_reward)/(j+1)
# Update the preferences
one_hot = np.zeros(k)
one_hot[a] = 1
bv = avg_reward if baseline else 0
h += alpha*(reward - bv)*(one_hot - pi)
# # Make sure h doesn't change the sum val
# h -= (np.sum(h) - sum_h)/k
# Store the reward
rewards[j] = reward
opt_act_flgs[j] = int(a == opt_a)
return rewards, opt_act_flgs
with parallel_backend('loky', n_jobs=10):
results = Parallel()(delayed(_one_sim)(i) for i in range(n_exps))
rewards, opt_act_flgs = zip(*results)
rewards = np.mean(np.array(rewards), axis=0)
opt_act_flgs = np.mean(np.array(opt_act_flgs), axis=0)
return rewards, opt_act_flgs
In [27]:
k_arms, n_steps, n_exps = 10, 1000, 2000
ls_alpha = [0.1, 0.4]
ls_baseline = [True, False]
ls_rewards = []
ls_opt_act_flgs = []
for alpha, baseline in product(ls_alpha, ls_baseline):
rewards, opt_act_flgs = k_bandit_sim_gradient(k_arms, n_steps, n_exps, alpha=alpha, baseline=baseline)
ls_rewards.append(rewards)
ls_opt_act_flgs.append(opt_act_flgs)
In [28]:
ls_param = list(product(ls_alpha, ls_baseline))
ls_param = ['{}, {}'.format(alpha, baseline) for alpha, baseline in ls_param]
plot_perf_metrics(ls_rewards, ls_opt_act_flgs, ls_param, param_prefix='alpha, baseline', plt_lib='plotly')
Ex 2.11¶
In [29]:
k_arms, n_steps, n_exps = 10, 20000, 500
# epsilon-greedy
ls_epsilon = [2.0**i for i in np.arange(-7, -2, .5)]
ls_eps_rewards = []
ls_eps_opt_act_flgs = []
for epsilon in ls_epsilon:
rewards, opt_act_flgs = k_bandit_sim_eps(k_arms, n_steps, n_exps, epsilon)
ls_eps_rewards.append(rewards)
ls_eps_opt_act_flgs.append(opt_act_flgs)
# espilon-greedy with optimistic initialization
ls_opt_init = [2.0**i for i in np.arange(-5, 3, .5)]
ls_eps_opt_init_rewards = []
ls_eps_opt_init_opt_act_flgs = []
for init_q_star in ls_opt_init:
rewards, opt_act_flgs = k_bandit_sim_eps(k_arms, n_steps, n_exps, 0.1, init_q_star=init_q_star)
ls_eps_opt_init_rewards.append(rewards)
ls_eps_opt_init_opt_act_flgs.append(opt_act_flgs)
# UCB
ls_ucb_c = [2.0**i for i in np.arange(-6, 3, .5)]
ls_ubc_rewards = []
ls_ubc_opt_act_flgs = []
for ucb_c in ls_ucb_c:
rewards, opt_act_flgs = k_bandit_sim_ucb(k_arms, n_steps, n_exps, ucb_c=ucb_c)
ls_ubc_rewards.append(rewards)
ls_ubc_opt_act_flgs.append(opt_act_flgs)
# Gradient
ls_alpha = [2.0**i for i in np.arange(-7, 3, .5)]
ls_gradient_rewards = []
ls_gradient_opt_act_flgs = []
for alpha in ls_alpha:
rewards, opt_act_flgs = k_bandit_sim_gradient(k_arms, n_steps, n_exps, alpha=alpha)
ls_gradient_rewards.append(rewards)
ls_gradient_opt_act_flgs.append(opt_act_flgs)
In [30]:
fig = make_subplots(rows=2, cols=1)
ls_avg_rewards = []
ls_avg_opt_act_flgs = []
for i, (ls_rewards, ls_opt_act_flgs, ls_param, method_name) in enumerate([
(ls_eps_rewards, ls_eps_opt_act_flgs, ls_epsilon, 'epsilon'),
(ls_eps_opt_init_rewards, ls_eps_opt_init_opt_act_flgs, ls_opt_init, 'epsilon_opt_init'),
(ls_ubc_rewards, ls_ubc_opt_act_flgs, ls_ucb_c, 'ucb'),
(ls_gradient_rewards, ls_gradient_opt_act_flgs, ls_alpha, 'gradient')]
):
avg_rewards = np.mean(np.array(ls_rewards)[:, :n_steps//2], axis=1)
avg_opt_act_flgs = np.mean(np.array(ls_opt_act_flgs)[:, :n_steps//2], axis=1)
ls_avg_rewards.append(avg_rewards)
ls_avg_opt_act_flgs.append(avg_opt_act_flgs)
text = [None for _ in range(len(ls_param))]
text[len(ls_param)//2] = method_name
fig.add_trace(
go.Scatter(
x=ls_param, y=avg_rewards, mode='lines+text', line=dict(color=COLOR_LIST[i]), name=f"{method_name}",
text=text, textposition='top center', textfont=dict(color=COLOR_LIST[i])
),
row=1, col=1)
fig.add_trace(
go.Scatter(
x=ls_param, y=avg_opt_act_flgs, mode='lines+text', line=dict(color=COLOR_LIST[i]), name=f"{method_name}",
text=text, textposition='top center', textfont=dict(color=COLOR_LIST[i]),
showlegend=False
),
row=2, col=1)
fig.update_xaxes(title_text='Parameter', row=1, col=1)
fig.update_yaxes(title_text='Average Reward', row=1, col=1)
fig.update_xaxes(title_text='Parameter', row=2, col=1)
fig.update_yaxes(title_text='% Optimal Action', row=2, col=1)
fig.update_layout(height=800, width=800, title_text='First half avg rewards', hovermode='x', xaxis_type='log', xaxis2_type='log')
fig.show()
In [31]:
fig = make_subplots(rows=2, cols=1)
ls_avg_rewards = []
ls_avg_opt_act_flgs = []
for i, (ls_rewards, ls_opt_act_flgs, ls_param, method_name) in enumerate([
(ls_eps_rewards, ls_eps_opt_act_flgs, ls_epsilon, 'epsilon'),
(ls_eps_opt_init_rewards, ls_eps_opt_init_opt_act_flgs, ls_opt_init, 'epsilon_opt_init'),
(ls_ubc_rewards, ls_ubc_opt_act_flgs, ls_ucb_c, 'ucb'),
(ls_gradient_rewards, ls_gradient_opt_act_flgs, ls_alpha, 'gradient')]
):
avg_rewards = np.mean(np.array(ls_rewards)[:, n_steps//2:], axis=1)
avg_opt_act_flgs = np.mean(np.array(ls_opt_act_flgs)[:, n_steps//2:], axis=1)
ls_avg_rewards.append(avg_rewards)
ls_avg_opt_act_flgs.append(avg_opt_act_flgs)
text = [None for _ in range(len(ls_param))]
text[len(ls_param)//2] = method_name
fig.add_trace(
go.Scatter(
x=ls_param, y=avg_rewards, mode='lines+text', line=dict(color=COLOR_LIST[i]), name=f"{method_name}",
text=text, textposition='top center', textfont=dict(color=COLOR_LIST[i])
),
row=1, col=1)
fig.add_trace(
go.Scatter(
x=ls_param, y=avg_opt_act_flgs, mode='lines+text', line=dict(color=COLOR_LIST[i]), name=f"{method_name}",
text=text, textposition='top center', textfont=dict(color=COLOR_LIST[i]),
showlegend=False
),
row=2, col=1)
fig.update_xaxes(title_text='Parameter', row=1, col=1)
fig.update_yaxes(title_text='Average Reward', row=1, col=1)
fig.update_xaxes(title_text='Parameter', row=2, col=1)
fig.update_yaxes(title_text='% Optimal Action', row=2, col=1)
fig.update_layout(height=800, width=800, title_text='Second half avg rewards', hovermode='x', xaxis_type='log', xaxis2_type='log')
fig.show()
In [ ]: